NMF on Gene expression and Chromatin accessibility data

# Helper functions 
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                            Compute K stats and                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Estimate K stats
my.kstats <- function(NMFexperiment){
  # calc different k stats
  NMFexperiment <- computeFrobErrorStats(NMFexperiment)
  NMFexperiment <- computeSilhoutteWidth(NMFexperiment)
  NMFexperiment <- computeCopheneticCoeff(NMFexperiment)
  NMFexperiment <- computeAmariDistances(NMFexperiment)
  return(NMFexperiment)
}

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                            Plot K stats                                    ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
my.plotKstats <- function(NMFexperiment, title){
  # visualize k stats
  gg.optKr <- plotKStats(NMFexperiment)
  gg.optKr <- gg.optKr + theme_bw() + 
    ggtitle(title) +
    theme(plot.title=element_text(hjust=0.5))
  return(gg.optKr)
}


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Plot H matrix heatmap                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

my.plotHMatrices <- function(NMFexperiment, heat.anno, col=viridis(n=100)){
  for(ki in names(NMFexperiment@HMatrixList)) {
  cat("\n")
  cat("  \n#### H matrix for k=",  ki, "  \n  ")
  #plot H matrix
  tmp.hmatrix <- HMatrix(NMFexperiment, k = ki)
  colnames(tmp.hmatrix) <- colnames(NMFexperiment)
  h.heatmap <- Heatmap(tmp.hmatrix,
                       col = col,
                       name = "Exposure",
                       clustering_distance_columns = 'pearson',
                       show_column_dend = TRUE,
                       heatmap_legend_param = 
                         list(color_bar = "continuous", legend_height=unit(2, "cm")),
                       top_annotation = heat.anno,
                       show_column_names = TRUE,
                       show_row_names = FALSE,
                       cluster_rows = FALSE)
  print(h.heatmap)
  }
}
#my.plotHMatrices(rna.norm.nmf.exp, heat.anno)
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                      Recovery plots functions                              ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

auc <- function(rnk.list,max=NULL) {
  aux = sapply(rnk.list,function(rnk) {
    if (is.null(max)) {max = max(rnk)} 
    rnk = sort(rnk)
    X = 0
    i = 1
    ngenes = length(rnk)
    while ((rnk[i] <= max) && (i <= length(rnk))) {X = X + max -rnk[i];i = i+1}
    rauc = X/(i-1)/max
    return(rauc)
  })
  return(aux)
}

roc <- function(rnk.list,max=NULL,title=NULL) {
  require(RColorBrewer)
  col = brewer.pal(length(rnk.list),'Set1')
  rnk = c(1,rnk.list[[1]])
  if (is.null(max)) {max = max(rnk)} else {rnk=c(rnk,max)}
  plot(rnk,(1:length(rnk))/length(rnk),type='s',col=col[1],lwd=3,main=title,ylab='',xlab='Ranks')
  for (i in 2:length(rnk.list)) {
    rnk = c(1,rnk.list[[i]])
    if (is.null(max)) {max = max(rnk)} else {rnk=c(rnk,max)}
    lines(rnk,(1:length(rnk))/length(rnk),type='s',col=col[i],lwd=3)
  }
  L = length(rnk.list[[1]])
  abline(1/L,(1-1/L)/(max),lty=2,lwd=2,col='darkgrey')
  legend('bottomright',legend = names(rnk.list),col=col,lwd=3)
}

recovery_plot <- function(h, annot, annotID){
  which.a = annotID
  annot.factor <- annot[,annotID]
  
  n.samples = nrow(annot)
  
  ALL.RNKS = lapply(levels(annot.factor),function(l) {
  RNKS=lapply(1:nrow(h),function(i) {
    exp = sort(h[i,],decreasing=TRUE)
    i.rnk = match(rownames(annot)[annot.factor==l],names(exp))
    i.rnk = sort(i.rnk[!is.na(i.rnk)])
    return(i.rnk)
  })
  names(RNKS) = paste0('Sig ',1:length(RNKS))
  return(RNKS)
  })
    names(ALL.RNKS) = levels(annot.factor)
    
    AUC.RAND = lapply(ALL.RNKS,function(r) {
    do.call('rbind',lapply(r, function(x) {
      ##
      l = lapply(1:500,function(i) {
        sample(1:n.samples,length(x))
      })
      aux = auc(l,max=n.samples)
      return(c(mean(aux),sd(aux)))
    }))
      })
  
  AUC = lapply(ALL.RNKS,auc,max=n.samples)
  
  
  PVAL = lapply(1:length(AUC),function(i) {
    x = data.frame(AUC.RAND[[i]],AUC[[i]])
    colnames(x) = c('mean','sd','val')
    z = (x[,3]-x[,1])/x[,2]
    p = ifelse(z>0,pnorm(z,lower.tail=FALSE),pnorm(z))
    x$z = z
    x$p = p
    return(x)
  })
  names(PVAL) = names(AUC)
  
  
  for (n in names(ALL.RNKS)) {
    cat("\n")
    cat("  \n##### ",  n, "  \n  ")
    #print(n)
    RNKS = ALL.RNKS[[n]]
    names(RNKS) = paste0(names(RNKS),' - Pval = ',sprintf('%.1e',PVAL[[n]][,5]))
    roc(RNKS,max=n.samples,title=paste0(annotID,' - level : ',n))
    
  }
}


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                    Plot H matrix heatmap integrative NMF                   ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##


plotH_oneView <- function(int_nmf, viewID, k, heat.anno, col=viridis(100), 
                          scale_color=TRUE, displayID = viewID){
  # Find which iteration returned the best results
  
  k <- paste0("k", k)
  top_idx <- int_nmf@best_factorization_idx
  idx <- top_idx[match(k, names(top_idx))]
  
  sharedH <- int_nmf@shared_HMatrix_list[[k]][[idx]]
  Hview   <- int_nmf@view_specific_HMatrix_list[[k]][[idx]][[viewID]]
  
  # Define total H matrix
  totalH <- sharedH + Hview
  # Color Function
  if (scale_color) {
    colf <- circlize::colorRamp2(seq(0, max(totalH), length.out = 100), col)
  } else {
    colf <- col
  }
  
  main_hist <- hclust(as.dist(1 - cor(totalH, method = "pearson")))
  
  tH.heatmap <- Heatmap(totalH,
                        col = colf,
                        name = "Total Exposure",
                        column_title = "Total H matrix",
                        cluster_columns = main_hist,
                        show_column_dend = TRUE,
                        top_annotation = heat.anno, 
                        show_column_names = FALSE,
                        show_row_names = FALSE,
                        cluster_rows = FALSE)
  
  sH.heatmap <- Heatmap(sharedH,
                        col = colf,
                        name = "Shared Exposure",
                        column_title = "Shared H matrix",
                        cluster_columns = main_hist,
                        show_column_dend = TRUE,
                        top_annotation = heat.anno,
                        show_column_names = FALSE,
                        show_row_names = FALSE,
                        cluster_rows = FALSE)
  
  vH.heatmap <- Heatmap(Hview,
                        col = colf,
                        name = "View Specific Exposure",
                        column_title = "View specific H matrix",
                        cluster_columns = main_hist,
                        show_column_dend = TRUE,
                        top_annotation = heat.anno,
                        show_column_names = FALSE,
                        show_row_names = FALSE,
                        cluster_rows = FALSE)
  #print(tH.heatmap + sH.heatmap + vH.heatmap)
  
  
  ht_global_opt(heatmap_legend_grid_height = unit(.25, "cm"))
  ht_list <- tH.heatmap + sH.heatmap + vH.heatmap
  draw(ht_list, row_title = displayID)
  ht_global_opt(RESET = TRUE)
}

Gene expression (RNAseq)

Data loading

Read normalized gene expression matrix…

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                         Read normalized data                               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
# read normalized matrix
rna.norm.mat <- readRDS("data/rnaseq/rnaseq_normalized_counts.RDS")
rna.annotation <- readRDS("data/rnaseq/rnaseq_annotation.RDS")

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Print dataset dimension                           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

cat("Dimension of transcriptome dataset (RNAseq):  \n\n  ") 

Dimension of transcriptome dataset (RNAseq):

kable(data.frame(dim(rna.norm.mat), row.names = c("features", "samples")), 
      col.names = "") 
features 21811
samples 45

Applying NMF

Applying Non-Negative Matrix Factorization (NMF) to normalized transcriptome data (RNAseq)

Factorization parameters:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Parameters to run NMF in GPUs using  pythonCuda               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
k.min <- 8
k.max <- 10
outer.iter <- 10
inner.iter <- 2*10^4

kable(data.frame(c(k.min, k.max, outer.iter, inner.iter), 
                 row.names = c("k.min", "k.max", "outer.iter", "inner.iter")), 
      col.names = "") 
k.min 8
k.max 10
outer.iter 10
inner.iter 20000
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Create nmf experiment object and run NMF                      ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rna.nmf.exp <- nmfExperimentFromMatrix(matrix = rna.norm.mat)


rna.nmf.exp <- runNMFtensor(rna.nmf.exp, 
                            k.min = k.min, 
                            k.max = k.max, 
                            outer.iter = outer.iter, 
                            inner.iter = inner.iter, 
                            conver.test.stop.threshold = 40)
## [1] "2019-04-02 10:15:40 UTC"
## Factorization rank:  8 
##  Iteration:  10 
## [1] "NMF converged after  253,131,118,268,199,215,228,189,275,356 iterations"
## [1] "2019-04-02 10:16:04 UTC"
## Factorization rank:  9 
##  Iteration:  10 
## [1] "NMF converged after  270,159,161,192,165,201,306,203,136,149 iterations"
## [1] "2019-04-02 10:16:21 UTC"
## Factorization rank:  10 
##  Iteration:  10 
## [1] "NMF converged after  163,215,266,267,240,125,135,155,229,270 iterations"
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                               Normalize NMF                                ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rna.norm.nmf.exp <- normalizeW(rna.nmf.exp)

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                      K stats and normalization                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Estimate K stats
rna.norm.nmf.exp <- my.kstats(rna.norm.nmf.exp)

Factorization quality metrics and optimal K

Based on the results of the factorization quality metrics, an optimal number of signatures (k) must be chosen:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                            Plot K stats                                    ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
gg.optKr <- my.plotKstats(rna.norm.nmf.exp, "NMF factorization quality metrics")
gg.optKr

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                        Generate river plot                                 ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
river <- generateRiverplot(rna.norm.nmf.exp)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)

Minize the Frobenius error, the coefficient of variation and the mean Amari distance, while maximizing the sum and mean silhouette width and the cophenic coefficient.

H Matrix, W normalized:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                        H matrix heatmap annotation                         ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Annotation for H matrix heatmap
# Define color vector
type.colVector <- list(Celltype = setNames(rna.annotation$color[match(levels(rna.annotation$Celltype), rna.annotation$Celltype)],
         levels(rna.annotation$Celltype))
)

# Build Heatmap annotation
heat.anno <- HeatmapAnnotation(df = data.frame(Celltype = rna.annotation$Celltype),
                               col = type.colVector,
                               show_annotation_name = TRUE, na_col = "white")


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Generate H matrix heatmap, W normalized                       ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

for(ki in names(rna.norm.nmf.exp@HMatrixList)) {
  cat("\n")
  cat("  \n#### H matrix for k=",  ki, "  {.tabset}   \n  ")
  #plot H matrix
  tmp.hmatrix <- HMatrix(rna.norm.nmf.exp, k = ki)
  colnames(tmp.hmatrix) <- colnames(rna.norm.nmf.exp)
  h.heatmap <- Heatmap(tmp.hmatrix,
                       col = viridis(100),
                       name = "Exposure",
                       clustering_distance_columns = 'pearson',
                       show_column_dend = TRUE,
                       heatmap_legend_param = 
                         list(color_bar = "continuous", legend_height=unit(2, "cm")),
                       top_annotation = heat.anno,
                       show_column_names = FALSE,
                       show_row_names = FALSE,
                       cluster_rows = FALSE)
  print(h.heatmap)
  
  cat("  \n Recovery plots for k=",  ki, "  \n  ")
  
  recovery_plot(tmp.hmatrix, rna.annotation, "Celltype")
  
  }

H matrix for k= 8


Recovery plots for k= 8

Bcell

CD4

CD8

CLP

CMP

GMP

HSC

LMPP

MEP

Mono

MPP

NK

H matrix for k= 9


Recovery plots for k= 9

Bcell

CD4

CD8

CLP

CMP

GMP

HSC

LMPP

MEP

Mono

MPP

NK

H matrix for k= 10


Recovery plots for k= 10

Bcell

CD4

CD8

CLP

CMP

GMP

HSC

LMPP

MEP

Mono

MPP

NK

Gene set enrichment analysis

Using the feature exposure extracted from the W matrix, a gene set enrichment analysis is perform agains all MSigDB terms

The optimal factorization rank selected was: K = 9

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                              W matrix Z scores                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rna.wmatrix <- WMatrix(rna.norm.nmf.exp, k = 9)
rownames(rna.wmatrix) <- rownames(rna.norm.nmf.exp)

#Zscore for each signature
rna.wmatrix.zscores <- apply(rna.wmatrix, MARGIN=2, function(wmat_score){
  (wmat_score - median(wmat_score)) / mad(wmat_score)
})
colnames(rna.wmatrix.zscores) <- paste0("Zscore_Sign", 1:9)


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##         GAGE (Generally Applicable Gene-set Enrichment) analysis           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Infer gene sets tha are significantly pertubed relative to all genes considered
#load precompiled GSEA MSigDB gene sets
gs.msigDB <- readList("db/msigdb.v6.2.symbols.gmt")
# head(gs.msigDB)

#run GAGE analysis
rna.msigDB.enrichment <- gage(rna.wmatrix.zscores, gsets=gs.msigDB, same.dir=TRUE)

#Drop NAs for upregulated
rna.msigDB.enrichment <- as.data.frame(rna.msigDB.enrichment$greater)
rna.msigDB.enrichment <- rna.msigDB.enrichment[!is.na(rna.msigDB.enrichment$p.geomean),]
rna.msigDB.enrichment <- rna.msigDB.enrichment[, paste0("Zscore_Sign", 1:9)]

# Select only more enriched terms in one signature compared to the others
idx <- apply(rna.msigDB.enrichment, 1, function(term){
  term <- -log10(term)
  # Change 0 to small value to avoid NAs
  term[term == 0] <- 1e-40
  # find if this term is more enriched in one signature compared to others
  is.enrich <- sapply(term, function(x){
    # p-value 5 times greater than at least 5 other signatures
    sum(x/term > 5) > 5
  })
  any(is.enrich)
})

rna.msigDB.enrichment <- rna.msigDB.enrichment[idx,]

# Print table
datatable(rna.msigDB.enrichment, filter="top",
          extensions = 'Buttons',
          options = list(dom = 'Bfrtip',
                         buttons = list(list(extend = 'collection',
                                             buttons = c('excel', 'csv'),
                                             text = 'DOWNLOAD DATA')))) %>%
  formatSignif(columns=colnames(rna.msigDB.enrichment), digits=3)

Chromatin accessibility (ATACseq)

Data loading

Read normalized chromatin accessibility matrix…

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                         Read normalized data                               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
# read normalized matrix
atac.norm.mat <- readRDS("data/atacseq/atacseq_normalized_counts.RDS")
atac.annotation <- readRDS("data/atacseq/atacseq_annotation.RDS")

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Print dataset dimension                           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

cat("Dimension of Chromatin accessibility dataset (ATACseq):  \n\n  ") 

Dimension of Chromatin accessibility dataset (ATACseq):

kable(data.frame(dim(atac.norm.mat), row.names = c("features", "samples")), 
      col.names = "") 
features 123102
samples 68

Applying NMF

Applying Non-Negative Matrix Factorization (NMF) to normalized Chromatin accessibility data (ATACseq)

Factorization parameters:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Parameters to run NMF in GPUs using  pythonCuda               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
k.min <- 8
k.max <- 10
outer.iter <- 5
inner.iter <- 2*10^4

kable(data.frame(c(k.min, k.max, outer.iter, inner.iter), 
                 row.names = c("k.min", "k.max", "outer.iter", "inner.iter")), 
      col.names = "") 
k.min 8
k.max 10
outer.iter 5
inner.iter 20000
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Create nmf experiment object and run NMF                      ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
atac.nmf.exp <- nmfExperimentFromMatrix(matrix = atac.norm.mat)


atac.nmf.exp <- runNMFtensor(atac.nmf.exp, 
                            k.min = k.min, 
                            k.max = k.max, 
                            outer.iter = outer.iter, 
                            inner.iter = inner.iter, 
                            conver.test.stop.threshold = 40)
## [1] "2019-04-02 10:18:06 UTC"
## Factorization rank:  8 
## [1] "NMF converged after  196,195,123,307,301 iterations"
## [1] "2019-04-02 10:18:50 UTC"
## Factorization rank:  9 
## [1] "NMF converged after  437,255,492,210,155 iterations"
## [1] "2019-04-02 10:19:58 UTC"
## Factorization rank:  10 
## [1] "NMF converged after  248,190,161,202,234 iterations"
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                               Normalize NMF                                ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
atac.norm.nmf.exp <- normalizeW(atac.nmf.exp)

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                      K stats and normalization                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Estimate K stats
atac.norm.nmf.exp <- my.kstats(atac.norm.nmf.exp)

Factorization quality metrics and optimal K

Based on the results of the factorization quality metrics, an optimal number of signatures (k) must be chosen:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                            Plot K stats                                    ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
gg.optKr <- my.plotKstats(atac.norm.nmf.exp, "NMF factorization quality metrics")
gg.optKr

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                        Generate river plot                                 ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
river <- generateRiverplot(atac.norm.nmf.exp)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)

Minize the Frobenius error, the coefficient of variation and the mean Amari distance, while maximizing the sum and mean silhouette width and the cophenic coefficient.

H Matrix, W normalized:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                        H matrix heatmap annotation                         ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Annotation for H matrix heatmap
# Define color vector
type.colVector <- list(Celltype = setNames(atac.annotation$color[match(levels(atac.annotation$Celltype), atac.annotation$Celltype)],
         levels(atac.annotation$Celltype))
)

# Build Heatmap annotation
heat.anno <- HeatmapAnnotation(df = data.frame(Celltype = atac.annotation$Celltype),
                               col = type.colVector,
                               show_annotation_name = TRUE, na_col = "white")


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Generate H matrix heatmap, W normalized                       ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
for(ki in names(atac.norm.nmf.exp@HMatrixList)) {
  cat("\n")
  cat("  \n#### H matrix for k=",  ki, "  {.tabset}   \n  ")
  #plot H matrix
  tmp.hmatrix <- HMatrix(atac.norm.nmf.exp, k = ki)
  colnames(tmp.hmatrix) <- colnames(atac.norm.nmf.exp)
  h.heatmap <- Heatmap(tmp.hmatrix,
                       col = viridis(100),
                       name = "Exposure",
                       clustering_distance_columns = 'pearson',
                       show_column_dend = TRUE,
                       heatmap_legend_param = 
                         list(color_bar = "continuous", legend_height=unit(2, "cm")),
                       top_annotation = heat.anno,
                       show_column_names = FALSE,
                       show_row_names = FALSE,
                       cluster_rows = FALSE)
  print(h.heatmap)
  
  cat("  \n Recovery plots for k=",  ki, "  \n  ")
  
  recovery_plot(tmp.hmatrix, atac.annotation, "Celltype")
  
  }

H matrix for k= 8


Recovery plots for k= 8

Bcell

CD4

CD8

CLP

CMP

GMP

HSC

LMPP

MEP

Mono

MPP

NK

H matrix for k= 9


Recovery plots for k= 9

Bcell

CD4

CD8

CLP

CMP

GMP

HSC

LMPP

MEP

Mono

MPP

NK

H matrix for k= 10


Recovery plots for k= 10

Bcell

CD4

CD8

CLP

CMP

GMP

HSC

LMPP

MEP

Mono

MPP

NK

gc()
       used  (Mb) gc trigger   (Mb)  max used   (Mb)

Ncells 11096612 592.7 18696788 998.6 18696788 998.6 Vcells 73660253 562.0 131581271 1003.9 131581269 1003.9

Integrative RNAseq & ATACseq

Gene expression (RNAseq) & Chromatin accessibility (ATACseq)

Only the those samples with RNAseq and ATACseq data were used in the integrative analysis.

Data loading

Read normalized gene expression matrix and chromatin accessibility matrix…

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                         Read normalized data                               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
# read normalized matrix
int.norm.mat <- readRDS("data/multiview/multiview_norm_mat_list.RDS")
int.annotation <- readRDS("data/multiview/multiview_annotation.RDS")

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                          Print dataset dimension                           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
cat("Dimension of transcriptome dataset (RNAseq):  \n\n  ") 

Dimension of transcriptome dataset (RNAseq):

kable(data.frame(dim(int.norm.mat$rna), row.names = c("features", "samples")), 
      col.names = "") 
features 21811
samples 24
cat("Dimension of Chromatin accessibility dataset (ATACseq):  \n\n  ") 

Dimension of Chromatin accessibility dataset (ATACseq):

kable(data.frame(dim(int.norm.mat$atac), row.names = c("features", "samples")), 
      col.names = "") 
features 123102
samples 24

Applying integrative NMF

Applying Integrative Non-Negative Matrix Factorization (NMF) to normalized Gene expression (RNAseq) and Chromatin accessibility data (ATACseq)

Factorization parameters:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Parameters to run NMF in GPUs using  pythonCuda               ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
k.min <- 8
k.max <- 10
outer.iter <- 5
inner.iter <- 2*10^4

kable(data.frame(c(k.min, k.max, outer.iter, inner.iter), 
                 row.names = c("k.min", "k.max", "outer.iter", "inner.iter")), 
      col.names = "") 
k.min 8
k.max 10
outer.iter 5
inner.iter 20000
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Create nmf experiment object and run NMF                      ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
int.nmf.exp <- run_integrative_NMF_tensor(int.norm.mat, 
                                          k_min = k.min, 
                                          k_max = k.max, 
                                          outer_iter = outer.iter, 
                                          inner_iter = inner.iter, 
                                          conver_stop_threshold = 40, 
                                          lambda=0.7)
## Running integrative NMF for views:  rna,atac 
## [1] "2019-04-02 10:22:38 UTC"
## Factorization rank:  8 
## [1] "NMF converged after  158,240,140,122,208 iterations"
## [1] "2019-04-02 10:23:11 UTC"
## Factorization rank:  9 
## [1] "NMF converged after  142,220,405,191,108 iterations"
## [1] "2019-04-02 10:23:47 UTC"
## Factorization rank:  10 
## [1] "NMF converged after  180,141,179,149,182 iterations"

Factorization quality metrics and optimal K

Based on the results of the factorization quality metrics, an optimal number of signatures (k) must be chosen:

RNAseq

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                RNAseq - Plot K stats - River plot                          ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

## Plot K stats
gg.optKr <- my.plotKstats(int.nmf.exp@view_specific_NMFexp_list$rna, "RNAseq factorization quality metrics")
gg.optKr

## Generate river plot
river <- generateRiverplot(int.nmf.exp@view_specific_NMFexp_list$rna)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)

ATACseq

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                ATACseq - Plot K stats - River plot                         ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##

## Plot K stats
gg.optKr <- my.plotKstats(int.nmf.exp@view_specific_NMFexp_list$atac, "ATACseq factorization quality metrics")
gg.optKr

## Generate river plot
river <- generateRiverplot(int.nmf.exp@view_specific_NMFexp_list$atac)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)

Minize the Frobenius error, the coefficient of variation and the mean Amari distance, while maximizing the sum and mean silhouette width and the cophenic coefficient.

H Matrices:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                        H matrix heatmap annotation                         ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Annotation for H matrix heatmap
# Define color vector
type.colVector <- list(Celltype = setNames(int.annotation$color[match(levels(int.annotation$Celltype), int.annotation$Celltype)],
         levels(int.annotation$Celltype))
)

# Build Heatmap annotation
heat.anno <- HeatmapAnnotation(df = data.frame(Celltype = int.annotation$Celltype),
                               col = type.colVector,
                               show_annotation_name = FALSE, na_col = "white")


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Generate H matrix heatmap, W normalized                       ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
for(ki in k.min:k.max) {
  cat("\n")
  #cat("  \n#### H matrix for k=",  ki, "  {.tabset}   \n  ")
  cat("  \n#### H matrix for k=",  ki, "  {.tabset}    \n  ")
  #plot H matrix
  cat("\n")
  cat("  \n##### not scaled  \n  ")
  plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno, 
                scale_color = FALSE, viewID = "rna", displayID = "RNAseq")
  plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno, 
                scale_color = FALSE, viewID = "atac", displayID = "ATACseq")
  
  cat("\n")
  cat("  \n##### scaled  \n  ")
  plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno, 
                scale_color = TRUE, viewID = "rna", displayID = "RNAseq")
  plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno, 
                scale_color = TRUE, viewID = "atac", displayID = "ATACseq")
  
  }

H matrix for k= 8

not scaled

scaled

H matrix for k= 9

not scaled

scaled

H matrix for k= 10

not scaled

scaled

Gene set enrichment analysis

Using the feature exposure extracted from the W matrix, a gene set enrichment analysis is perform agains all MSigDB terms

The optimal factorization rank selected was: K = 9

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##                              W matrix Z scores                             ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
idx <- int.nmf.exp@best_factorization_idx
idx <- idx[names(idx) == "k9"]

int.rna.wmatrix <- int.nmf.exp@view_specific_WMatrix_list$k9[[idx]]$rna
rownames(int.rna.wmatrix) <- rownames(int.norm.mat$rna)

#Zscore for each signature
int.rna.wmatrix.zscores <- apply(int.rna.wmatrix, MARGIN=2, function(wmat_score){
  (wmat_score - median(wmat_score)) / mad(wmat_score)
})
colnames(int.rna.wmatrix.zscores) <- paste0("Zscore_Sign", 1:9)


##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##         GAGE (Generally Applicable Gene-set Enrichment) analysis           ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Infer gene sets tha are significantly pertubed relative to all genes considered
#load precompiled GSEA MSigDB gene sets
gs.msigDB <- readList("db/msigdb.v6.2.symbols.gmt")
# head(gs.msigDB)

#run GAGE analysis
int.rna.msigDB.enrichment <- gage(int.rna.wmatrix.zscores, gsets=gs.msigDB, same.dir=TRUE)

#Drop NAs for upregulated
int.rna.msigDB.enrichment <- as.data.frame(int.rna.msigDB.enrichment$greater)
int.rna.msigDB.enrichment <- int.rna.msigDB.enrichment[!is.na(int.rna.msigDB.enrichment$p.geomean),]
int.rna.msigDB.enrichment <- int.rna.msigDB.enrichment[, paste0("Zscore_Sign", 1:9)]

# Select only more enriched terms in one signature compared to the others
idx <- apply(int.rna.msigDB.enrichment, 1, function(term){
  term <- -log10(term)
  # Change 0 to small value to avoid NAs
  term[term == 0] <- 1e-40
  # find if this term is more enriched in one signature compared to others
  is.enrich <- sapply(term, function(x){
    # p-value 5 times greater than at least 5 other signatures
    sum(x/term > 5) > 5
  })
  any(is.enrich)
})

int.rna.msigDB.enrichment <- int.rna.msigDB.enrichment[idx,]

# Print table
datatable(int.rna.msigDB.enrichment, filter="top",
          extensions = 'Buttons',
          options = list(dom = 'Bfrtip',
                         buttons = list(list(extend = 'collection',
                                             buttons = c('excel', 'csv'),
                                             text = 'DOWNLOAD DATA')))) %>%
  formatSignif(columns=colnames(int.rna.msigDB.enrichment), digits=3)

iNMF Recovery plots:

##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
##              Generate H matrix recovery plots                              ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rownames(int.annotation) <- int.annotation$sampleID
int.annotation$Celltype <- factor(int.annotation$Celltype, levels = unique(int.annotation$Celltype))

for(ki in k.min:k.max) {
  cat("\n")
  #cat("  \n#### H matrix for k=",  ki, "  {.tabset}   \n  ")
  cat("  \n### Recovery plots for k=",  ki, "   {.tabset}      \n  ")
  #plot H matrix
  
  # Find best shared matrix, according to factorization metrics
  k <- paste0("k", ki)
  top_idx <- int.nmf.exp@best_factorization_idx
  idx <- top_idx[match(k, names(top_idx))]
  
  cat("\n")
  cat("  \n#### Shared H Matrix      \n  ")
  sharedH <- int.nmf.exp@shared_HMatrix_list[[k]][[idx]]
  colnames(sharedH) <- int.annotation$sampleID
  # Make recovery plots
  recovery_plot(sharedH, int.annotation, "Celltype")
  
  cat("\n")
  cat("  \n#### RNAseq H Matrix      \n  ")
  viewH <- int.nmf.exp@view_specific_HMatrix_list[[k]][[idx]]$rna
  colnames(viewH) <- int.annotation$sampleID
  # Make recovery plots
  recovery_plot(viewH, int.annotation, "Celltype")
  
  cat("\n")
  cat("  \n#### ATACseq H Matrix      \n  ")
  viewH <- int.nmf.exp@view_specific_HMatrix_list[[k]][[idx]]$atac
  colnames(viewH) <- int.annotation$sampleID
  # Make recovery plots
  recovery_plot(viewH, int.annotation, "Celltype")
  
  }

Recovery plots for k= 8

Shared H Matrix

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

RNAseq H Matrix

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

ATACseq H Matrix

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

Recovery plots for k= 9

Shared H Matrix

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

RNAseq H Matrix

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

ATACseq H Matrix

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

Recovery plots for k= 10

Shared H Matrix

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

RNAseq H Matrix

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP

ATACseq H Matrix

HSC

MPP

LMPP

CMP

GMP

MEP

Mono

CD4

CD8

NK

Bcell

CLP